Topics:
install.packages("robustbase")
install.packages("sampleSelection")
install.packages("margins")
install.packages("tmap")
library(robustbase)
library(sampleSelection)
library(readxl)
library(tidyverse)
library(olsrr)
library(sandwich)
library(lmtest)
library(margins)
library(tmap)
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
library(sf)
Linking to GEOS 3.9.1, GDAL 3.3.0, PROJ 7.2.0; sf_use_s2() is TRUE
pzn_rent <- read_excel("data/rent-poznan.xlsx")
set.seed(123)
pzn_rent_subset <- pzn_rent %>%
add_count(quarter, name = "quarter_count") %>%
filter(quarter_count >= 50) %>%
filter(price >= 500, price <= 15000, flat_area >= 15, flat_area <= 250) %>%
sample_n(3000)
pzn_rent_subset
model_pzn <- lm(formula = price ~ flat_area + flat_rooms + individual + flat_furnished +
flat_for_students + flat_balcony,
data = pzn_rent_subset)
summary(model_pzn)
Call:
lm(formula = price ~ flat_area + flat_rooms + individual + flat_furnished +
flat_for_students + flat_balcony, data = pzn_rent_subset)
Residuals:
Min 1Q Median 3Q Max
-5043.1 -276.5 -53.7 223.2 9772.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 553.5873 30.9729 17.873 < 2e-16 ***
flat_area 20.7297 0.8218 25.226 < 2e-16 ***
flat_rooms 85.4827 19.8450 4.308 1.70e-05 ***
individualTRUE 24.8742 25.6332 0.970 0.33193
flat_furnishedTRUE 138.6813 25.5310 5.432 6.02e-08 ***
flat_for_studentsTRUE -169.2845 25.6333 -6.604 4.71e-11 ***
flat_balconyTRUE 64.1760 20.6216 3.112 0.00188 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 544.6 on 2993 degrees of freedom
Multiple R-squared: 0.4227, Adjusted R-squared: 0.4215
F-statistic: 365.2 on 6 and 2993 DF, p-value: < 2.2e-16
plot(model_pzn)
summary(model_pzn_rob)
Call:
robustbase::lmrob(formula = price ~ flat_area + flat_rooms + individual +
flat_furnished + flat_for_students + flat_balcony, data = pzn_rent_subset,
method = "MM")
\--> method = "MM"
Residuals:
Min 1Q Median 3Q Max
-3705.20 -217.14 -13.43 255.22 9774.74
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 752.428 27.813 27.053 < 2e-16 ***
flat_area 12.665 1.162 10.902 < 2e-16 ***
flat_rooms 139.919 19.646 7.122 1.33e-12 ***
individualTRUE 41.007 16.476 2.489 0.0129 *
flat_furnishedTRUE 84.644 17.560 4.820 1.51e-06 ***
flat_for_studentsTRUE -85.358 15.842 -5.388 7.67e-08 ***
flat_balconyTRUE 74.079 14.368 5.156 2.69e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Robust residual standard error: 344.6
Multiple R-squared: 0.4267, Adjusted R-squared: 0.4255
Convergence in 20 IRWLS iterations
Robustness weights:
53 observations c(176,198,199,208,260,284,411,540,601,629,664,684,698,723,743,761,786,851,909,943,1242,1277,1328,1350,1358,1379,1449,1460,1609,1622,1841,1889,1898,1900,2062,2080,2120,2258,2297,2301,2400,2424,2445,2463,2496,2557,2625,2663,2749,2816,2883,2909,2975)
are outliers with |weight| = 0 ( < 3.3e-05);
240 weights are ~= 1. The remaining 2707 ones are summarized as
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0000354 0.8664000 0.9523000 0.8848000 0.9856000 0.9990000
Algorithmic parameters:
tuning.chi bb tuning.psi refine.tol rel.tol
1.548e+00 5.000e-01 4.685e+00 1.000e-07 1.000e-07
scale.tol solve.tol eps.outlier eps.x warn.limit.reject
1.000e-10 1.000e-07 3.333e-05 4.547e-10 5.000e-01
warn.limit.meanrw
5.000e-01
nResample max.it best.r.s k.fast.s k.max maxit.scale trace.lev
500 50 2 1 200 200 0
mts compute.rd fast.s.large.n
1000 0 2000
psi subsampling cov compute.outlier.stats
"bisquare" "nonsingular" ".vcov.avar1" "SM"
seed : int(0)
plot(model_pzn_rob$model$price, model_pzn_rob$rweights)
Sample selection models:
basic selection model: Heckman’s model more advanced econometric models: copula selection models more statistical approach: not missing at random models
Generate sample data
set.seed(123)
n <- 1000
x1 <- rnorm(n = n, mean = 0, sd = 1)
x2 <- rnorm(n = n, mean = 0, sd = 1)
errors <- MASS::mvrnorm(n = n, mu = c(0, 0),
Sigma = matrix(c(1, 0.5, 0.5, 2), nrow=2),
empirical = T)
selmodel <- data.frame(r = 1 + x1 + x2 + errors[, 1],
y = 1 + x1 + errors[, 2])
selmodel$sel <- selmodel$r > 0
selection(selection = sel ~ x1 + x2,outcome = y ~ x1, data = selmodel) |>
summary()
Spatial
df_salaries <- read_excel("data/data-salaries.xlsx", sheet = 2) %>%
dplyr::select(id = Kod, name = Nazwa, salaries = Wartosc)
df_real <- read_excel("data/data-real-estate.xlsx", sheet = 2) %>%
dplyr::select(id = Kod, name = Nazwa, real = Wartosc)
df_model <- df_salaries %>%
inner_join(df_real) %>%
filter(real > 0)
Joining, by = c("id", "name")
plot((df_model$salaries), (df_model$real))
model1 <- lm(formula = log(real) ~ log(salaries), data = df_model)
df_model$resids <- resid(model1)
plot(model1)
powiats %>%
dplyr::select(id=jpt_kod_je) %>%
left_join(df_model %>%
mutate(id = substr(id, 1,4))) -> for_plot
Joining, by = "id"
tm_shape(for_plot) +
tm_polygons(col = "resids", style = "jenks") +
tm_shape(woj) +
tm_borders(lwd = 2)
Warning: The projection of the shape object for_plot is not known, while it seems to be projected.
Warning: Current projection of shape for_plot unknown and cannot be determined.
Warning: Current projection of shape woj unknown and cannot be determined.
Variable(s) "resids" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
Generating correlated data
library(MASS)
m <- 2
sigma <- diag(c(1,2), 2, 2)
sigma[1,2] <- sigma[2,1] <- 0.5
fake_data <- MASS::mvrnorm(n = 2000, mu=rep(0, m), Sigma = sigma, empirical = T)
cor(fake_data)
plot(fake_data)